Last updated: 6/30/2023
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from itertools import combinations
import scipy.stats as stats
import warnings
import matplotlib.gridspec as gridspec
warnings.filterwarnings("ignore")
# --- Load the raw Craigslist vehicle listings ---
DATA_PATH = "/Users/juanherrera/Desktop/MacBook Pro/Data Science/Projects/Used Cars Regression Problem/vehicles.csv"
df = pd.read_csv(DATA_PATH)
# Inspect dtypes and non-null counts for every column
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 426880 entries, 0 to 426879 Data columns (total 26 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 426880 non-null int64 1 url 426880 non-null object 2 region 426880 non-null object 3 region_url 426880 non-null object 4 price 426880 non-null int64 5 year 425675 non-null float64 6 manufacturer 409234 non-null object 7 model 421603 non-null object 8 condition 252776 non-null object 9 cylinders 249202 non-null object 10 fuel 423867 non-null object 11 odometer 422480 non-null float64 12 title_status 418638 non-null object 13 transmission 424324 non-null object 14 VIN 265838 non-null object 15 drive 296313 non-null object 16 size 120519 non-null object 17 type 334022 non-null object 18 paint_color 296677 non-null object 19 image_url 426812 non-null object 20 description 426810 non-null object 21 county 0 non-null float64 22 state 426880 non-null object 23 lat 420331 non-null float64 24 long 420331 non-null float64 25 posting_date 426812 non-null object dtypes: float64(5), int64(2), object(19) memory usage: 84.7+ MB
# Remove identifier / URL / geo columns that carry no predictive signal
columns_to_drop = [
    'url', 'region_url', 'image_url', 'description',
    'id', 'county', 'lat', 'long', 'region', 'model',
    'size', 'posting_date',
]
df.drop(columns=columns_to_drop, inplace=True)
# Sanity-check the trimmed frame
df.tail()
| price | year | manufacturer | condition | cylinders | fuel | odometer | title_status | transmission | VIN | drive | type | paint_color | state | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 426875 | 23590 | 2019.0 | nissan | good | 6 cylinders | gas | 32226.0 | clean | other | 1N4AA6AV6KC367801 | fwd | sedan | NaN | wy |
| 426876 | 30590 | 2020.0 | volvo | good | NaN | gas | 12029.0 | clean | other | 7JR102FKXLG042696 | fwd | sedan | red | wy |
| 426877 | 34990 | 2020.0 | cadillac | good | NaN | diesel | 4174.0 | clean | other | 1GYFZFR46LF088296 | NaN | hatchback | white | wy |
| 426878 | 28990 | 2018.0 | lexus | good | 6 cylinders | gas | 30112.0 | clean | other | 58ABK1GG4JU103853 | fwd | sedan | silver | wy |
| 426879 | 30590 | 2019.0 | bmw | good | NaN | gas | 22716.0 | clean | other | WBA4J1C58KBM14708 | rwd | coupe | NaN | wy |
# --- Top-5 category frequencies for every categorical column ---
# Create a figure with a custom grid layout
fig = plt.figure(figsize=(20, 15))
gs = fig.add_gridspec(4, 4)
# All object-dtype columns except VIN (nearly every VIN is unique, so a
# frequency bar chart of it is meaningless)
categorical_variables = list(df.select_dtypes(include=['object']).columns)
categorical_variables.remove('VIN')
# (row, col) grid slot for each variable's subplot
position_list = [[0, 1], [0, 2], [0, 3], [1, 1], [1, 2], [1, 3],
                 [2, 1], [2, 2], [2, 3], [3, 1], [3, 2]]
for var, (row, col) in zip(categorical_variables, position_list):
    ax1 = fig.add_subplot(gs[row, col])
    # FIX: compute the frequency table once per column — the original
    # recomputed value_counts(normalize=True) three times per iteration.
    vc = df[var].value_counts(normalize=True)
    top5 = vc.head(5)
    ax1.bar(top5.index, top5, color='blue')
    ax1.set_title(f"{var} - {vc.shape[0]} unique categories",
                  fontdict={'fontsize': 12})
    plt.xticks(rotation=90)
# Adjust spacing between subplots
fig.tight_layout()
# Display the figure
plt.show()
# Drop rows with missing values in columns where nulls are rare
df.dropna(subset=['manufacturer'], inplace=True)
df.dropna(subset=['fuel'], inplace=True)
df.dropna(subset=['transmission'], inplace=True)
## Imputation technique for some categorical variables
# Missing condition / title_status / paint_color become their own
# 'unknown' label so the rows can be kept.
for col in ('condition', 'title_status', 'paint_color'):
    df[col] = df[col].fillna('unknown')
# FIX: the original did fillna('unknown'), then relabeled 'unknown' -> 0 and
# everything else -> 1 in three steps. A single notna() expresses the same
# "was a VIN listed?" binary indicator (1 = VIN present, 0 = missing).
df['VIN'] = df['VIN'].notna().astype(int)
# --- Manufacturer-derived features ---
# Median price per manufacturer, most expensive first
median_price_by_maker = (
    df.groupby('manufacturer')['price']
      .median()
      .sort_values(ascending=False)
      .reset_index()
)
# Flag listings whose make is among the 10 most expensive (by median price)
top_expensive_cars_list = median_price_by_maker.head(10)['manufacturer'].tolist()
df['is_expensive'] = df['manufacturer'].isin(top_expensive_cars_list).astype(int)
# Collapse rare manufacturers into a single 'other' bucket, keeping only
# the 10 most frequent makes as distinct labels
top_frequency_cars = df['manufacturer'].value_counts(normalize=True).head(10).index
df.loc[~df['manufacturer'].isin(top_frequency_cars), 'manufacturer'] = 'other'
# Creating binary indicator variables so the model can deal with the missing values.
# FIX: the same three-statement pattern was copy-pasted for cylinders, drive
# and type; one loop creates the flags in the same order with identical values.
for col in ('cylinders', 'drive', 'type'):
    missing_mask = df[col].isnull()
    # 1 where the value was missing, 0 otherwise
    df[f'is_missing_{col}'] = missing_mask.astype(int)
    # Impute the gap with an explicit "unknown" category
    df.loc[missing_mask, col] = 'unknown'
# --- Integer-encode the remaining categorical (object) columns ---
from sklearn.preprocessing import LabelEncoder

cat_cols_list = df.select_dtypes(include='object').columns
label_encoder = LabelEncoder()
# Re-fit per column: each column gets its own independent 0..k-1 mapping
for col in cat_cols_list:
    df[col] = label_encoder.fit_transform(df[col])
# Quick peek at the fully numeric frame
df.head(1)
| price | year | manufacturer | condition | cylinders | fuel | odometer | title_status | transmission | VIN | drive | type | paint_color | state | is_expensive | is_missing_cylinders | is_missing_drive | is_missing_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 27 | 33590 | 2014.0 | 4 | 2 | 6 | 2 | 57923.0 | 0 | 2 | 1 | 3 | 8 | 11 | 1 | 1 | 0 | 1 | 0 |
# We can see there're outliers in the price variable:
# describe() shows max price ~3.7e9 and max odometer 1e7 — far beyond plausible values.
df[['price', 'year', 'odometer']].describe()
| price | year | odometer | |
|---|---|---|---|
| count | 4.050020e+05 | 404996.000000 | 4.015180e+05 |
| mean | 7.761953e+04 | 2011.491311 | 9.681546e+04 |
| std | 1.250398e+07 | 8.944425 | 2.007469e+05 |
| min | 0.000000e+00 | 1900.000000 | 0.000000e+00 |
| 25% | 5.959000e+03 | 2008.000000 | 3.805000e+04 |
| 50% | 1.398800e+04 | 2014.000000 | 8.590650e+04 |
| 75% | 2.633700e+04 | 2017.000000 | 1.333090e+05 |
| max | 3.736929e+09 | 2022.000000 | 1.000000e+07 |
# --- Price: raw distribution, Box-Cox transform, and IQR outlier removal ---
fig = plt.figure(figsize=(15, 4))
gs = fig.add_gridspec(1, 3)
# First Histogram: raw price (heavily right-skewed with extreme outliers)
ax1 = fig.add_subplot(gs[0, 0])
ax1.hist(df['price'], bins=30, color='blue', alpha=0.5)
ax1.set_title('Price Distribution - High Presence of Outliers (1)', fontdict={'fontsize': 10})
# Second Histogram: Box-Cox requires strictly positive input, so drop 0-price rows
df = df[df['price'] > 0]
df['price'], lambda_price = stats.boxcox(df['price'])
ax2 = fig.add_subplot(gs[0, 1])
ax2.hist(df['price'], bins=30, color='green', alpha=0.5)
ax2.set_title('Boxcox Transformation - Still Presence of Outliers (2)', fontdict={'fontsize': 10})
# Remove price outliers with the interquartile-range rule
Q1 = np.percentile(df['price'], 25)
Q3 = np.percentile(df['price'], 75)
IQR = Q3 - Q1
# BUG FIX: the lower fence is Q1 - 1.5*IQR; the original used Q3 - 1.5*IQR,
# which raised the fence and wrongly discarded legitimate low-priced rows.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df = df[(df['price'] >= lower_bound) & (df['price'] <= upper_bound)]
# Third Histogram: transformed, fenced distribution
ax3 = fig.add_subplot(gs[0, 2])
ax3.hist(df['price'], bins=30, color='red', alpha=0.5)
ax3.set_title('Boxcox Transformation - Outliers Removed (3)', fontdict={'fontsize': 10})
# Adjust spacing between subplots
fig.tight_layout()
# Display the figure
plt.show()
# Keep cars made after 1996 and before 2021
year_mask = (df['year'] > 1996) & (df['year'] < 2021)
plt.figure(figsize=(14, 9))
sns.barplot(x='year', y='price', data=df[year_mask], estimator=np.mean)
# Rotate x-axis labels
plt.xticks(rotation=90)
# title
plt.title("Average Car Price per Year")
plt.show()
df = df[year_mask]
# We can still see the presence of outliers within each model year.
# FIX: removed the stray plt.figure(figsize=(20,12)) — plotly express creates
# its own figure, so that call only emitted an empty matplotlib canvas
# (the "<Figure size 2000x1200 with 0 Axes>" artifact in the output).
fig = px.box(df, x='year', y='price', title='Box Plot - cars per year')
fig.show()
<Figure size 2000x1200 with 0 Axes>
# --- Odometer: raw distribution, Box-Cox transform, and IQR outlier removal ---
fig = plt.figure(figsize=(15, 4))
gs = fig.add_gridspec(1, 3)
# First Histogram: raw odometer readings
ax1 = fig.add_subplot(gs[0, 0])
ax1.hist(df['odometer'], bins=30, color='blue', alpha=0.5)
ax1.set_title('Odometer Distribution - High Presence of Outliers', fontdict={'fontsize': 10})
# Second Histogram: Box-Cox requires strictly positive input, so drop 0-mile rows
df = df[df['odometer'] > 0]
df['odometer'], lambda_odometer = stats.boxcox(df['odometer'])
ax2 = fig.add_subplot(gs[0, 1])
ax2.hist(df['odometer'], bins=30, color='green', alpha=0.5)
ax2.set_title('Boxcox Transformation - Still Presence of Outliers', fontdict={'fontsize': 10})
# Remove odometer outliers with the interquartile-range rule
Q1 = np.percentile(df['odometer'], 25)
Q3 = np.percentile(df['odometer'], 75)
IQR = Q3 - Q1
# BUG FIX: the lower fence is Q1 - 1.5*IQR; the original used Q3 - 1.5*IQR,
# which raised the fence and wrongly discarded legitimate low-mileage rows.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df = df[(df['odometer'] >= lower_bound) & (df['odometer'] <= upper_bound)]
# Third Histogram: transformed, fenced distribution
ax3 = fig.add_subplot(gs[0, 2])
ax3.hist(df['odometer'], bins=30, color='red', alpha=0.5)
ax3.set_title('Boxcox Transformation - Outliers Removed', fontdict={'fontsize': 10})
# Adjust spacing between subplots
fig.tight_layout()
# Display the figure
plt.show()
# New Feature: car_age — years since manufacture, relative to 2023
# (the notebook's "last updated" year; revisit if rerun later)
df['car_age'] = 2023 - df['year']
# Draw the pairplot on a 1000-row sample so rendering stays fast
sample_df = df.sample(1000)
sns.pairplot(sample_df[['price', 'car_age', 'odometer']])
plt.show()
# Pairwise correlations among the numeric features
sns.heatmap(df[['price', 'car_age', 'odometer']].corr(), annot=True)
plt.show()
H0: There is not a significant relationship between these numerical variables
H1: There is a significant relationship between these numerical variables
Significance level: 0.05. Any p-value above that threshold indicates no statistical significance
# Pearson correlation coefficient and p-value for every pair of
# numeric variables, collected into a small summary table.
significance_coe_dict = {'v1': [], 'v2': [], 'correlation': [], 'p_value': []}
# numerical variables
variables = ['price', 'year', 'odometer']
for v1, v2 in combinations(variables, 2):
    # Calculate the correlation coefficient and p-value
    correlation, p_value = stats.pearsonr(df[v1], df[v2])
    significance_coe_dict['v1'].append(v1)
    significance_coe_dict['v2'].append(v2)
    significance_coe_dict['correlation'].append(correlation)
    significance_coe_dict['p_value'].append(p_value)
pd.DataFrame(significance_coe_dict)
| v1 | v2 | correlation | p_value | |
|---|---|---|---|---|
| 0 | price | year | 0.652856 | 0.0 |
| 1 | price | odometer | -0.608504 | 0.0 |
| 2 | year | odometer | -0.648285 | 0.0 |
# all our variables are numeric, have been transformed and cleaned
df.head(2)
| price | year | manufacturer | condition | cylinders | fuel | odometer | title_status | transmission | VIN | drive | type | paint_color | state | is_expensive | is_missing_cylinders | is_missing_drive | is_missing_type | car_age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 27 | 31.870039 | 2014.0 | 4 | 2 | 6 | 2 | 219.083961 | 0 | 2 | 1 | 3 | 8 | 11 | 1 | 1 | 0 | 1 | 0 | 9.0 |
| 28 | 29.226483 | 2010.0 | 1 | 2 | 6 | 2 | 238.756401 | 0 | 2 | 1 | 3 | 8 | 1 | 1 | 0 | 0 | 1 | 0 | 13.0 |
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

# Target is the Box-Cox-transformed price; drop 'year' since 'car_age'
# carries the same information
y = df['price']
X = df.drop(['price', 'year'], axis=1)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Add the intercept column and fit ordinary least squares on the training data
X_train = sm.add_constant(X_train)
result = sm.OLS(y_train, X_train).fit()
# Keep residuals and fitted values for the diagnostic plots below
residuals_ols = result.resid
fittedvalues_ols = result.fittedvalues
print(result.summary())
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.663
Model: OLS Adj. R-squared: 0.663
Method: Least Squares F-statistic: 2.787e+04
Date: Mon, 03 Jul 2023 Prob (F-statistic): 0.00
Time: 10:56:41 Log-Likelihood: -5.5979e+05
No. Observations: 240464 AIC: 1.120e+06
Df Residuals: 240446 BIC: 1.120e+06
Df Model: 17
Covariance Type: nonrobust
========================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------
const 32.7167 0.044 750.259 0.000 32.631 32.802
manufacturer -0.0901 0.002 -53.112 0.000 -0.093 -0.087
condition 0.0923 0.002 40.689 0.000 0.088 0.097
cylinders 1.1051 0.005 201.794 0.000 1.094 1.116
fuel -0.8954 0.007 -123.806 0.000 -0.910 -0.881
odometer -0.0205 0.000 -198.896 0.000 -0.021 -0.020
title_status -0.2479 0.005 -49.720 0.000 -0.258 -0.238
transmission 0.2374 0.009 27.687 0.000 0.221 0.254
VIN 0.8129 0.012 67.238 0.000 0.789 0.837
drive -0.7256 0.008 -85.997 0.000 -0.742 -0.709
type 0.0106 0.001 7.600 0.000 0.008 0.013
paint_color 0.0261 0.001 20.892 0.000 0.024 0.029
state -0.0032 0.000 -9.467 0.000 -0.004 -0.003
is_expensive 1.6782 0.016 105.058 0.000 1.647 1.709
is_missing_cylinders -3.6518 0.022 -164.050 0.000 -3.695 -3.608
is_missing_drive 1.3000 0.023 55.625 0.000 1.254 1.346
is_missing_type -0.1789 0.017 -10.530 0.000 -0.212 -0.146
car_age -0.3637 0.001 -260.216 0.000 -0.366 -0.361
==============================================================================
Omnibus: 15931.440 Durbin-Watson: 2.012
Prob(Omnibus): 0.000 Jarque-Bera (JB): 29894.849
Skew: 0.484 Prob(JB): 0.00
Kurtosis: 4.430 Cond. No. 2.44e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.44e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
# --- Weighted least squares, weighting each point by 1 / OLS-residual^2 ---
# NOTE(review): weighting by the inverse of the *fitted* squared residuals
# makes the model reproduce the training data almost exactly (hence the
# reported R^2 of 1.000); a standard FGLS approach would instead model the
# error variance as a function of the fitted values — confirm intent.
squared_residuals_ols = np.square(residuals_ols)
weights = 1 / squared_residuals_ols
# Fit the weighted least squares regression model
result_wls = sm.WLS(y_train, X_train, weights=weights).fit()
# Keep residuals and fitted values for the diagnostic plots below
residuals_wls = result_wls.resid
fittedvalues_wls = result_wls.fittedvalues
# View the summary of the weighted least squares regression model
print(result_wls.summary())
WLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 1.000
Model: WLS Adj. R-squared: 1.000
Method: Least Squares F-statistic: 3.964e+09
Date: Mon, 03 Jul 2023 Prob (F-statistic): 0.00
Time: 10:56:41 Log-Likelihood: -3.9545e+05
No. Observations: 240464 AIC: 7.909e+05
Df Residuals: 240446 BIC: 7.911e+05
Df Model: 17
Covariance Type: nonrobust
========================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------
const 32.7165 0.000 6.9e+04 0.000 32.716 32.717
manufacturer -0.0901 1.64e-05 -5481.664 0.000 -0.090 -0.090
condition 0.0924 1.92e-05 4818.655 0.000 0.092 0.092
cylinders 1.1052 5.48e-05 2.02e+04 0.000 1.105 1.105
fuel -0.8954 6.58e-05 -1.36e+04 0.000 -0.896 -0.895
odometer -0.0205 1.14e-06 -1.8e+04 0.000 -0.020 -0.020
title_status -0.2479 3.35e-05 -7392.730 0.000 -0.248 -0.248
transmission 0.2373 6.9e-05 3438.134 0.000 0.237 0.237
VIN 0.8131 0.000 6955.891 0.000 0.813 0.813
drive -0.7255 6.83e-05 -1.06e+04 0.000 -0.726 -0.725
type 0.0106 1.03e-05 1028.201 0.000 0.011 0.011
paint_color 0.0261 9.67e-06 2694.868 0.000 0.026 0.026
state -0.0032 2.43e-06 -1304.938 0.000 -0.003 -0.003
is_expensive 1.6782 0.000 1.11e+04 0.000 1.678 1.678
is_missing_cylinders -3.6522 0.000 -1.68e+04 0.000 -3.653 -3.652
is_missing_drive 1.2999 0.000 8240.714 0.000 1.300 1.300
is_missing_type -0.1787 0.000 -1484.335 0.000 -0.179 -0.178
car_age -0.3637 1.64e-05 -2.22e+04 0.000 -0.364 -0.364
==============================================================================
Omnibus: 823924.374 Durbin-Watson: 2.006
Prob(Omnibus): 0.000 Jarque-Bera (JB): 40070.456
Skew: 0.097 Prob(JB): 0.00
Kurtosis: 1.010 Cond. No. 1.91e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.91e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
from sklearn.linear_model import Lasso, Ridge

# Split the data into training and testing sets (same seed as the OLS split,
# so all four models are evaluated on the same hold-out rows)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Lasso model (L1 regularization)
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train, y_train)
# Ridge model (L2 regularization)
ridge_model = Ridge(alpha=0.1)
ridge_model.fit(X_train, y_train)
# Hold-out predictions, residuals and R^2 for both models
lasso_predictions = lasso_model.predict(X_test)
residuals_lasso = y_test - lasso_predictions
lasso_r2 = lasso_model.score(X_test, y_test)
ridge_predictions = ridge_model.predict(X_test)
residuals_ridge = y_test - ridge_predictions
ridge_r2 = ridge_model.score(X_test, y_test)
# Residual distribution of all models.
# FIX: sns.distplot is deprecated (removed in seaborn >= 0.14);
# kdeplot is the direct replacement for the hist=False, kde=True usage.
plt.figure(figsize=(6, 4))
sns.kdeplot(residuals_ols, linewidth=3, label='OLS')
sns.kdeplot(residuals_wls, linewidth=3, label='WLS')
sns.kdeplot(residuals_lasso, linewidth=3, label='Lasso')
sns.kdeplot(residuals_ridge, linewidth=3, label='Ridge')
plt.legend()
plt.show()
# Q-Q plots of the residuals for each model, with the model's R^2 in the title
fig = plt.figure(figsize=(20, 7))
gs = fig.add_gridspec(2, 4)
# (grid row, grid col, residual series, subplot title)
qq_specs = [
    (0, 0, residuals_ols, f'OLS fit - r2 = {result.rsquared}'),
    (0, 1, residuals_wls, f'WLS fit - r2 = {result_wls.rsquared}'),
    (1, 0, residuals_lasso, f'Lasso fit - r2 = {lasso_r2}'),
    (1, 1, residuals_ridge, f'Ridge fit - r2 = {ridge_r2}'),
]
for row, col, resid, title in qq_specs:
    ax = fig.add_subplot(gs[row, col])
    sm.qqplot(resid, line='q', ax=ax)
    ax.set_title(title, fontdict={'fontsize': 10})
# Adjust spacing between subplots
fig.tight_layout()
# Display the figure
plt.show()
# Fitted-vs-residual scatter for each model.
# BUG FIX: the original built the DataFrame twice per plot and called
# .sample(1000) independently for 'fitted' and 'residuals', so the plotted
# x/y pairs came from two DIFFERENT random row subsets and did not
# correspond to the same observations. Sample once and take both columns.
fig = plt.figure(figsize=(20, 7))
gs = fig.add_gridspec(2, 4)
# (grid row, grid col, fitted values, residuals, model label)
plot_specs = [
    (0, 0, fittedvalues_ols, residuals_ols, 'OLS'),
    (0, 1, fittedvalues_wls, residuals_wls, 'WLS'),
    (1, 0, lasso_predictions, residuals_lasso, 'Lasso'),
    (1, 1, ridge_predictions, residuals_ridge, 'Ridge'),
]
for row, col, fitted_vals, resid_vals, label in plot_specs:
    ax = fig.add_subplot(gs[row, col])
    sampled = pd.DataFrame({'fitted': fitted_vals, 'residuals': resid_vals}).sample(1000)
    ax.scatter(x=sampled['fitted'], y=sampled['residuals'])
    ax.set_title(f'{label} fit - Fitted vs. Residuals', fontdict={'fontsize': 10})
# Adjust spacing between subplots
fig.tight_layout()
# Display the figure
plt.show()
# Pair each OLS coefficient with the first test observation's feature value,
# plus the fitted intercept, to walk through a prediction by hand.
dat = pd.DataFrame({
    "coefficients": result.params[1:],
    "data_point": X_test.iloc[0],
})
dat['constant'] = result.params.iloc[0]
dat
| coefficients | data_point | constant | |
|---|---|---|---|
| manufacturer | -0.090134 | 1.000000 | 32.716685 |
| condition | 0.092347 | 2.000000 | 32.716685 |
| cylinders | 1.105092 | 5.000000 | 32.716685 |
| fuel | -0.895436 | 2.000000 | 32.716685 |
| odometer | -0.020473 | 281.171828 | 32.716685 |
| title_status | -0.247875 | 0.000000 | 32.716685 |
| transmission | 0.237401 | 2.000000 | 32.716685 |
| VIN | 0.812939 | 1.000000 | 32.716685 |
| drive | -0.725580 | 1.000000 | 32.716685 |
| type | 0.010628 | 0.000000 | 32.716685 |
| paint_color | 0.026072 | 0.000000 | 32.716685 |
| state | -0.003176 | 35.000000 | 32.716685 |
| is_expensive | 1.678191 | 0.000000 | 32.716685 |
| is_missing_cylinders | -3.651846 | 0.000000 | 32.716685 |
| is_missing_drive | 1.299969 | 0.000000 | 32.716685 |
| is_missing_type | -0.178870 | 0.000000 | 32.716685 |
| car_age | -0.363720 | 7.000000 | 32.716685 |
y = 32.7167 × 1.000000 + (-0.0901) × 1.000000 + 0.0923 × 2.000000 + 1.1051 × 5.000000 + (-0.8954) × 2.000000 + (-0.0205) × 281.171828 + (-0.2479) × 0.000000 + 0.2374 × 2.000000 + 0.8129 × 1.000000 + (-0.7256) × 1.000000 + 0.0106 × 0.000000 + 0.0261 × 0.000000 + (-0.0032) × 35.000000 + 1.6782 × 0.000000 + (-3.6518) × 0.000000 + 1.3000 × 0.000000 + (-0.1789) × 0.000000 + (-0.3637) × 7.000000
Result: 28.686077526000012
# Model's prediction for the same observation — should closely match the
# manual coefficient-by-coefficient sum computed above.
X_test_with_constant = sm.add_constant(X_test)
result.predict(X_test_with_constant.iloc[0])
None 28.694362 dtype: float64
# Actual (Box-Cox-transformed) price of that observation, for comparison
y_test.iloc[0]
25.570301765409404
# Make predictions on all the testing data, with 95% confidence intervals
X_test_with_constant = sm.add_constant(X_test)
predictions = result.get_prediction(X_test_with_constant)
predicted_values = predictions.predicted_mean
confidence_intervals = predictions.conf_int(alpha=0.05)
# conf_int returns an (n, 2) array: column 0 = lower, column 1 = upper.
# (Vectorized slicing replaces the original element-by-element loop.)
lower_bound = confidence_intervals[:, 0]
upper_bound = confidence_intervals[:, 1]
final_df = X_test_with_constant.copy()
# BUG FIX: 'predicted_value' was assigned lower_bound in the original,
# so the "prediction" column silently held the lower CI bound.
final_df['predicted_value'] = predicted_values
final_df['lower_bound_ci'] = lower_bound
final_df['upper_bound_ci'] = upper_bound
# Undo the Box-Cox transforms so predictions/CIs are back in dollars
# and odometer is back in miles
from scipy.special import inv_boxcox
final_df['predicted_value'] = inv_boxcox(final_df['predicted_value'], lambda_price)
final_df['lower_bound_ci'] = inv_boxcox(final_df['lower_bound_ci'], lambda_price)
final_df['upper_bound_ci'] = inv_boxcox(final_df['upper_bound_ci'], lambda_price)
final_df['odometer'] = inv_boxcox(final_df['odometer'], lambda_odometer)
# Plot a 100-row sample: predicted price and CI bounds against odometer
sample_final = final_df.sample(100)
plt.scatter(x=sample_final['odometer'], y=sample_final['lower_bound_ci'])
plt.scatter(x=sample_final['odometer'], y=sample_final['predicted_value'])
plt.scatter(x=sample_final['odometer'], y=sample_final['upper_bound_ci'])
plt.show()